# -*- coding: utf-8 -*-
"""
Created on Wed Dec 26 20:57:49 2018

@author: MaYuling
"""

import requests
import time
from requests.exceptions import RequestException
import pandas as pd
import re

def get_one_page(url):
    """Download ``url`` and return its body as text.

    Returns the response text when the server answers with HTTP 200.
    Returns ``None`` for any other status code, and also when the request
    raises a ``RequestException``.

    Side effect: on a ``RequestException`` the module-level name ``j`` is
    appended to the module-level list ``wrongid``; both must exist when
    this function is called.
    """
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }
    try:
        # timeout is a (connect, read) pair in seconds.
        reply = requests.get(url=url, headers=browser_headers, timeout=(3, 5))
        return reply.text if reply.status_code == 200 else None
    except RequestException:
        wrongid.append(j)
        return None
                                                  
def parse_one_page(html):
    """Extract the fields of one deal page and store them as one row.

    The page text is scanned with five regular expressions (listing
    statistics, basic attributes, transaction attributes, deal record and
    breadcrumb position, in that order). Every captured group is stripped
    and appended to a flat list, followed by every ``resblockName`` value
    found and a ``'1'``/``'0'`` flag telling whether the
    ``tag is_near_subway`` marker occurs in the page.

    A pattern that does not match contributes no fields, and a pattern
    that matches several times contributes several groups of fields, so
    the resulting row is not guaranteed to have a fixed length.

    Args:
        html: Page source as ``str``, or ``None`` when the download failed.

    Returns:
        None. The row is printed and appended to the module-level list
        ``houseinfos``. On failure (including ``html is None``) the
        module-level name ``j`` is appended to the module-level list
        ``wrongid`` instead.
    """
    if html is None:
        # Nothing was downloaded; record the ID explicitly instead of
        # relying on an AttributeError from None.find().
        wrongid.append(j)
        return

    position='成都二手房成交价格.*?">(.*?)二手房成交价格</a>.*?">(.*?)二手房成交价格</a>'
    msg='msg">.*?<label>(.*?)</label>挂牌价格.*?<label>(.*?)</label>成交周期.*?<label>(.*?)</label>调价.*?<label>(.*?)</label>带看.*?<label>(.*?)</label>关注.*?<label>(.*?)</label>浏览'
    baseinfos='房屋户型</span>(.*?)</li>.*?所在楼层</span>(.*?)</li>.*?建筑面积</span>(.*?)</li>.*?户型结构</span>(.*?)</li>.*?套内面积</span>(.*?)</li>.*?建筑类型</span>(.*?)</li>.*?房屋朝向</span>(.*?)</li>.*?建成年代</span>(.*?)</li>.*?装修情况</span>(.*?)</li>.*?建筑结构</span>(.*?)</li>.*?供暖方式</span>(.*?)</li>.*?梯户比例</span>(.*?)</li>.*?产权年限</span>(.*?)</li>.*?配备电梯</span>(.*?)</li>'
    transinfos='链家编号</span>(.*?)</li>.*?交易权属</span>(.*?)</li>.*?挂牌时间</span>(.*?)</li>.*?房屋用途</span>(.*?)</li>.*?房屋年限</span>(.*?)</li>.*?房权所属</span>(.*?)</li>'
    # Deal record: deal price, unit price, deal date.
    record='record_list.*?record_price".*?>(.*?)万.*?record_detail">单价(.*?)元/平,(.*?)成交</p>'
    tag='tag is_near_subway'

    try:
        houseinfo = []
        # Order matters: it defines the column order of the final table.
        for pattern in (msg, baseinfos, transinfos, record, position):
            # re.S lets '.' span newlines, since the fields sit on separate lines.
            for groups in re.findall(pattern, html, re.S):
                houseinfo.extend(field.strip() for field in groups)
        houseinfo.extend(re.findall("resblockName:'(.*?)'", html))
        # `in` also detects a match at index 0, which `find(...) > 0` missed;
        # append() adds the flag as a single field regardless of its length.
        houseinfo.append('1' if tag in html else '0')
        print(houseinfo)
        houseinfos.append(houseinfo)
    except Exception:
        # Best effort: remember the ID and carry on with the next page.
        # KeyboardInterrupt/SystemExit are deliberately not caught so the
        # crawl can still be stopped by the user.
        wrongid.append(j)
def importid(path=r"C:\Users\MaYuling\Desktop\houseid.txt"):
    """Load house IDs from a text file into the module-level ``houseid`` list.

    Each non-blank line is expected to look like ``["id1", "id2", ...]``:
    the surrounding ``["`` / ``"]`` are stripped and the rest is split on
    ``", "``.

    Args:
        path: File to read (UTF-8). Defaults to the original hard-coded
            location, so existing ``importid()`` calls behave as before.

    Returns:
        None. The parsed IDs are appended to the module-level ``houseid``.
    """
    # `with` guarantees the handle is closed even if parsing fails.
    with open(path, "r", encoding='UTF-8') as file:
        for ln in file:
            # Remove the line terminator first; otherwise the trailing
            # '"]' is shielded by '\n' and stays glued to the last ID.
            ln = ln.strip()
            if not ln:
                continue
            ids = ln.strip('["').strip('"]').split('", "')
            # Drop empty strings (e.g. from a line that is just "[]").
            houseid.extend(one_id for one_id in ids if one_id)
def main(i):
    """Fetch and parse the deal page of the house whose ID string is ``i``.

    Builds the page URL from ``i``, prints it, downloads the page with
    ``get_one_page`` and passes the result to ``parse_one_page``.
    """
    page_url = 'https://cd.lianjia.com/chengjiao/' + i + '.html'
    print(page_url)
    parse_one_page(get_one_page(page_url))

if __name__ == '__main__':
    # Shared module-level state used by importid(), get_one_page() and
    # parse_one_page().
    houseid = []     # IDs to crawl, filled by importid()
    wrongid = []     # IDs whose download or parse failed
    houseinfos = []  # one list of fields per successfully parsed page
    importid()

    # The loop variable must remain the module-level name `j`:
    # get_one_page() and parse_one_page() read it as a global to record
    # which ID failed.
    for k, j in enumerate(houseid, 1):
        print('第'+str(k)+'条数据')
        main(j)
        time.sleep(0.01)  # short pause between requests

    # The export lives inside the __main__ guard so that importing this
    # module no longer fails with NameError on `houseinfos`.
    columns = ['挂牌价格','成交周期','调价次数','带看次数','关注人数','浏览次数','房屋户型','所在楼层','建筑面积','户型结构','套内面积','建筑类型','房屋朝向','建成年代','装修情况', '建筑结构','供暖方式','梯户比例','产权年限','配备电梯','链家编号','交易权属','挂牌时间','房屋用途','房屋年限','房权所属','成交价格','单位价格','成交时间','区域1','区域2','区域3','近地铁']
    houseinfoss = pd.DataFrame(houseinfos)
    if houseinfoss.shape[1] == len(columns):
        houseinfoss.columns = columns
    else:
        # Assigning the names would raise ValueError (no rows scraped, or a
        # row with a different number of fields) and lose the whole crawl;
        # keep the default numeric headers and still write the data.
        print('Warning: expected ' + str(len(columns)) + ' columns but got '
              + str(houseinfoss.shape[1]) + '; writing CSV without column names')
    houseinfoss.to_csv(r"C:\Users\MaYuling\Desktop\lianjia_uncleardata.csv", encoding='utf_8_sig', index=False)

    # The same ID can be recorded by both the fetch and the parse step.
    wrongid = list(set(wrongid))

